Analysis Overview

Oakland, California is a large urban city located in the East Bay portion of the greater San Francisco Bay region. Oakland is bracketed on the west by San Francisco Bay and on the east by the Oakland Hills. Large portions of urbanized Oakland now exist in areas that used to be within the bay margin or tidal marsh. The hills to the east were dominated by coastal redwood forests and similar land cover. Development of the city over the past 150 years has drastically altered the landscape.

Currently, the city has a number of programs focused on urban greenspace including "Green Streets and Raingardens" and "Greenspace and Carbon Removal". Portions of east Oakland adjacent to the Oakland Hills are very affluent. More urban parts of the city are lower on the socioeconomic spectrum.

In this analysis, we will compare tree canopy cover as measured by NDVI against median income. We will use ordinary linear regression to attempt to identify a relationship between these two factors.

InΒ [1]:
import os
import pathlib

import census
import earthpy as et
import geopandas as gpd
import geoviews as gv
import gitpass
import holoviews as hv
import hvplot.pandas
import hvplot.xarray
import numpy as np
import pandas as pd
import pystac_client
import requests
import rioxarray as rxr
import rioxarray.merge as rxrm
import shapely
import sklearn
import us
import xarray as xr
import xrspatial
from census import Census
from us import states
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Location where all downloaded data for this analysis is cached.
data_dir = os.path.join(et.io.HOME, et.io.DATA_NAME)

# makedirs with exist_ok replaces the original exists-check loop over a
# one-element list; it is race-free and idiomatic.
os.makedirs(data_dir, exist_ok=True)
InΒ [2]:
# Expected on-disk location of the Oakland city boundary shapefile.
oak_bound_path = os.path.join(data_dir, 'oak_bound', 'cityboundary.shp')

# The Oakland open-data portal serves the shapefile through a redirect,
# so we fetch it manually instead of with et.data.get_data.
oak_bound_url = (
    'https://data.oaklandca.gov/api/geospatial/'
    '9bhq-yt6w?method=export&format=Shapefile'
)

def download_zip_from_redirect(url, save_path, file_name):
    """
    Download a ZIP file from a URL that redirects to the actual file.

    Parameters
    ==========
    url: str
        The URL to download from; redirects are followed automatically.

    save_path: str
        The directory in which to save the downloaded file.

    file_name: str
        The base name for the saved file; ".zip" is appended.

    Returns
    =======
    str or None
        The path of the downloaded file, or None if the download failed.
    """
    try:
        # requests.get follows redirects by default, so a single streamed
        # GET both resolves the redirect and fetches the payload (the
        # original issued an extra HEAD request first). A timeout keeps a
        # stalled server from hanging the notebook indefinitely.
        response = requests.get(url, stream=True, allow_redirects=True,
                                timeout=60)
        response.raise_for_status()

        file_path = os.path.join(save_path, file_name + ".zip")

        # Stream the body to disk in chunks so the whole archive is never
        # held in memory.
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

        print(f"File downloaded successfully to: {file_path}")
        return file_path
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
        return None

# Download the boundary archive once and cache it locally. The original
# guard tested oak_bound_path, a path the download never creates, so the
# archive was re-fetched on every run and the downloaded copy was never
# actually read.
oak_zip_path = os.path.join(data_dir, 'oakland_boundary.shp.zip')
if not os.path.exists(oak_zip_path):
    print('downloading ' + oak_bound_url)
    download_zip_from_redirect(oak_bound_url, data_dir, "oakland_boundary.shp")

# geopandas reads zipped shapefiles directly; fall back to the URL only
# if the download failed.
read_target = oak_zip_path if os.path.exists(oak_zip_path) else oak_bound_url
oak_bound_gdf = gpd.read_file(read_target).to_crs(4326)
oak_bound_gdf.hvplot(aspect='equal')
downloading https://data.oaklandca.gov/api/geospatial/9bhq-yt6w?method=export&format=Shapefile
File downloaded successfully to: C:\Users\Pete\earth-analytics\data\oakland_boundary.shp.zip
Out[2]:
InΒ [3]:
# Download 2023 TIGER/Line census tracts for California (FIPS 06)
ca_tract_path = os.path.join(
    data_dir,
    'earthpy-downloads',
    'tl_2023_06_tract',
    'tl_2023_06_tract.shp'
)
ca_census_url = ('https://www2.census.gov/geo/tiger/'
                 'TIGER2023/TRACT/tl_2023_06_tract.zip'
                )
if not os.path.exists(ca_tract_path):
    print('downloading ' + ca_census_url)
    # et.data.get_data unzips into data_dir/earthpy-downloads
    ca_tract_shp = et.data.get_data(url=ca_census_url)
else:
    print("CA census tract data already exists")

ca_tract_gdf = gpd.read_file(ca_tract_path).to_crs(4326)

# Select tracts that fall entirely within the Oakland city boundary
# (the next cell re-joins with 'intersects' to keep edge tracts too).
oak_tract_gdf = gpd.sjoin(ca_tract_gdf, oak_bound_gdf, predicate='within')
oak_tract_gdf.hvplot(aspect='equal') * oak_bound_gdf.hvplot(aspect='equal')
CA census tract data already exists
Out[3]:
InΒ [4]:
# Redo the join with 'intersects' so tracts that cross the city boundary
# are kept as well; the 'within' join above dropped tracts only partially
# inside Oakland.
oak_tract_gdf = gpd.sjoin(ca_tract_gdf, oak_bound_gdf, predicate='intersects')
oak_tract_gdf.hvplot(aspect='equal') * oak_bound_gdf.hvplot(aspect='equal')
Out[4]:
InΒ [5]:
# Clip census tract boundaries to the San Francisco Bay edge

# NOTE(review): the cached filename below says 'Lake_Michigan_Shoreline',
# but the data read from bay_url is the UC Berkeley SF Bay shoreline;
# this path is currently unused because the data is read straight from
# the URL each run.
bay_path = os.path.join(
    data_dir,
    'bay_shp',
    'Lake_Michigan_Shoreline.shp'
)

# UC Berkeley geodata archive containing the SF Bay shoreline polygons
bay_url = ('https://spatial.lib.berkeley.edu/public/ark28722-s7d02x/data.zip')

bay_gdf = gpd.read_file(bay_url).to_crs(4326)
# OBJECTID 3 is the bay polygon we difference the tracts against —
# TODO confirm against the source data dictionary.
bay_gdf = bay_gdf[bay_gdf['OBJECTID'] == 3]

# Remove the portion of each tract polygon that falls inside the bay
tract_clip = gpd.overlay(oak_tract_gdf, bay_gdf, how='difference')
tract_clip.hvplot(aspect='equal')
Out[5]:
InΒ [7]:
# Download census data for all Alameda County tracts

# Retrieve the Census API key interactively so it is never hard-coded
api_key = gitpass.gitpass('US Census API Key')
c = Census(api_key)

# B06011_001E: median income in the past 12 months (ACS 5-year estimate)
census_fields = [
    'NAME',
    'B06011_001E'
]

state_code = us.states.CA.fips
county_code = '001'  # Alameda County

# Pull the fields for every tract ('*') in the county, ACS 5-year 2021
tract_table = c.acs5.state_county_tract(
    fields=census_fields,
    state_fips=state_code,
    county_fips=county_code,
    tract='*',
    year=2021,
)
tract_df = pd.DataFrame(tract_table)

tract_df
Out[7]:
NAME B06011_001E state county tract
0 Census Tract 4001, Alameda County, California 105875.0 06 001 400100
1 Census Tract 4002, Alameda County, California 98688.0 06 001 400200
2 Census Tract 4003, Alameda County, California 79947.0 06 001 400300
3 Census Tract 4004, Alameda County, California 82784.0 06 001 400400
4 Census Tract 4005, Alameda County, California 50156.0 06 001 400500
... ... ... ... ... ...
374 Census Tract 9819, Alameda County, California 173750.0 06 001 981900
375 Census Tract 9820, Alameda County, California 120893.0 06 001 982000
376 Census Tract 9821, Alameda County, California 4561.0 06 001 982100
377 Census Tract 9832, Alameda County, California 127273.0 06 001 983200
378 Census Tract 9900, Alameda County, California -666666666.0 06 001 990000

379 rows Γ— 5 columns

InΒ [51]:
# Join census income data to the clipped tract geometries
oak_tract_gdf = tract_clip.rename(columns={'TRACTCE': 'tract'})

oak_tract_census_gdf = oak_tract_gdf.merge(tract_df, on='tract')
oak_tract_census_gdf = oak_tract_census_gdf.rename(
    columns={'B06011_001E': 'Median_Income'})

# Drop the Census missing-data sentinel (-666666666.0) and any other
# negative incomes. NOTE: 'tract' must remain a column (not the index)
# because later cells access it as a column; the original had a
# no-op `set_index('tract')` whose result was discarded — removed here.
oak_tract_census_gdf = oak_tract_census_gdf[
    oak_tract_census_gdf['Median_Income'] >= 0]
gv.tile_sources.OSM() * gv.Polygons(oak_tract_census_gdf)
Out[51]:
InΒ [52]:
# Open the Microsoft Planetary Computer STAC catalog, which hosts the
# NAIP imagery collection queried per census tract in the cells below.

pc_catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1"
)
# Display the catalog title to confirm the connection succeeded.
pc_catalog.title
Out[52]:
'Microsoft Planetary Computer STAC API'
InΒ [Β ]:
# Accumulate NAIP tile URLs for each census tract

naip_url_dfs = []

for index, row in oak_tract_census_gdf.iterrows():
    try:
        row_geometry = row['geometry']
        # Search the NAIP collection for 2022 tiles intersecting the tract
        naip_search = pc_catalog.search(
            collections=["naip"],
            intersects=shapely.to_geojson(row_geometry),
            datetime="2022"
        )
        for naip_item in naip_search.items():
            if naip_item:
                print("Getting info for tract " + naip_item.id)
                # One-row frame per tile keeps tract/tile/url aligned
                naip_url_dfs.append(
                    pd.DataFrame(dict(
                        tract=[row.tract],
                        tile_id=[naip_item.id],
                        url=[naip_item.assets['image'].href]
                    ))
                )
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
    # are not swallowed; a failed search just skips the tract.
    except Exception:
        print("Error occurred while getting info for tract " + row.tract)
        continue

# Merge the per-tile results into a single dataframe for analysis
naip_url_df = pd.concat(naip_url_dfs)
naip_url_df
InΒ [Β ]:
tract_ndvi_accumulator = []

# For each tract: clip every overlapping NAIP tile, compute NDVI,
# mosaic the tiles, and record the percent of canopy-covered pixels.
for tract, naip_item_df in naip_url_df.groupby('tract'):
    print(tract)
    tract_ndvi_das = []
    for i, naip_url_s in naip_item_df.iterrows():
        # Open the NAIP tile and drop any length-1 dimensions
        full_naip_vda = rxr.open_rasterio(naip_url_s.url, masked=True).squeeze()

        # Get this tract's boundary reprojected to the tile's CRS
        boundary_gdf = oak_tract_census_gdf.to_crs(
            full_naip_vda.rio.crs)[oak_tract_census_gdf.tract == tract]

        # Clip the tile to the tract: coarse bounding-box crop first,
        # then the exact geometry clip
        try:
            crop_naip_vda = full_naip_vda.rio.clip_box(
                *boundary_gdf.total_bounds)
            naip_vda = crop_naip_vda.rio.clip(boundary_gdf.geometry)
        # Narrowed from a bare `except:`; a failed clip just means this
        # tile contributes nothing to the tract mosaic.
        except Exception:
            print("error clipping NAIP for tract " + tract)
            continue

        # NDVI = (NIR - red) / (NIR + red); NAIP band 4 is NIR, band 1 red
        tract_ndvi_das.append(
            (naip_vda.sel(band=4) - naip_vda.sel(band=1))
            / (naip_vda.sel(band=4) + naip_vda.sel(band=1))
        )

    # If every tile failed to clip there is nothing to summarize; the
    # original indexed tract_ndvi_das[0] here and raised IndexError.
    if not tract_ndvi_das:
        continue

    # Mosaic the per-tile NDVI rasters into one raster for the tract
    if len(tract_ndvi_das) > 1:
        tract_ndvi_da = rxrm.merge_arrays(tract_ndvi_das)
    else:
        tract_ndvi_da = tract_ndvi_das[0]

    # Percent of valid (non-NaN) pixels above the 0.12 canopy threshold;
    # NaN > 0.12 is False, so masked pixels are excluded from the count.
    tract_total_pixels = tract_ndvi_da.notnull().sum()
    threshold_pixels_sum = (tract_ndvi_da > .12).sum()
    threshold_pct = threshold_pixels_sum / tract_total_pixels * 100
    print(threshold_pct)

    tract_ndvi_accumulator.append(
        pd.DataFrame(dict(
            tract=[tract],
            naip_pct=[threshold_pct.item()]
        ))
    )

# Build the summary dataframe once, after the loop finishes (the
# original re-concatenated the accumulator on every iteration).
tract_ndvi_df = pd.concat(tract_ndvi_accumulator)
InΒ [65]:
# Save NDVI analysis output to a CSV file (only if not already saved)

csv_path = os.path.join(data_dir, 'oakland_ndvi_by_tract.csv')

if not os.path.exists(csv_path):
    print('copying results to csv ')
    # The guard above guarantees the file does not exist, so the default
    # write mode is used; the original's mode='a' was misleading (it
    # behaves identically on a missing file but implies appending).
    tract_ndvi_df.to_csv(csv_path, index=False)
else:
    print("NDVI results csv already exists.")
copying results to csv 
InΒ [Β ]:
# Load the saved NDVI results back into a dataframe
ndvi_df = pd.read_csv(csv_path)

# Keep only tracts whose NDVI percentage is non-negative
ndvi_df = ndvi_df.loc[ndvi_df['naip_pct'] >= 0]

# The CSV round-trip stores tract as an integer; match dtypes so the
# merge key lines up
oak_tract_census_gdf['tract'] = oak_tract_census_gdf['tract'].astype('int64')

oak_tract_final = pd.merge(oak_tract_census_gdf, ndvi_df, on='tract')

oak_tract_final
InΒ [69]:
# QA/QC: sanity-check the value ranges feeding the regression.
# (Removed the unused leftover `layout_chi = hv.Layout()`.)

income_min = oak_tract_final['Median_Income'].min()
income_max = oak_tract_final['Median_Income'].max()

print("Range of values in the 'median_income' column:")
print("Min:", income_min)
print("Max:", income_max)

ndvi_min = oak_tract_final['naip_pct'].min()
ndvi_max = oak_tract_final['naip_pct'].max()

print("Range of values in the NDVI pct column:")
print("Min:", ndvi_min)
print("Max:", ndvi_max)
Range of values in the 'median_income' column:
Min: 4561.0
Max: 173750.0
Range of values in the NDVI pct column:
Min: 1.2103743075253135
Max: 58.35590104804
InΒ [70]:
# Create choropleth plots to confirm the joined results visually

def _tract_choropleth(column, title):
    # Shared styling for the two side-by-side confirmation maps.
    return oak_tract_final.hvplot(
        c=column,
        cmap='Viridis',
        geo=True,
        tiles='CartoLight',
        width=400,
        height=400,
        title=title,
    )

med_income_plot = _tract_choropleth(
    'Median_Income', 'Choropleth Plot - Median Income')
ndvi_plot = _tract_choropleth(
    'naip_pct', 'Choropleth Plot - NDVI % above Threshold')

combined_plot = med_income_plot + ndvi_plot

combined_plot
Out[70]:
InΒ [72]:
# Transform median income to log space so the skewed income
# distribution is closer to normal for the linear regression.
oak_tract_final['log_median_income'] = np.log(oak_tract_final['Median_Income'])

oak_tract_final = oak_tract_final.set_index('tract')
oak_tract_final = oak_tract_final.rename(columns={'naip_pct': 'ndvi_pct'})
# Log-transform the NDVI percentage as well for the same reason.
oak_tract_final['log_ndvi_pct'] = np.log(oak_tract_final['ndvi_pct'])

# Display a scatter matrix of my variables.
hvplot.scatter_matrix(
    oak_tract_final[['log_median_income', 'log_ndvi_pct']]
)
Out[72]:
InΒ [73]:
# Set up test and train data for the ordinary linear regression model.
# (Removed three no-op bare expressions from the original cell.)

X = oak_tract_final[['log_median_income']]
y = oak_tract_final[['log_ndvi_pct']]

# Fixed random_state makes the split reproducible; use the name already
# imported at the top of the file for consistency.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the regression on the training split
linear_reg = LinearRegression().fit(X_train, y_train)

# Back-transform measurements and predictions out of log space
y_measured = np.exp(y_test)
y_hat = np.exp(linear_reg.predict(X_test))

# Collect measured vs. predicted NDVI % for the test tracts
test_df = y_test.copy()
test_df['predicted_ndvi_pct'] = y_hat
test_df['measured_ndvi_pct'] = y_measured

# Plot predicted against measured with a 1:1 reference line
y_max = float(test_df.measured_ndvi_pct.max())
(
    test_df
    .hvplot.scatter(x='measured_ndvi_pct', y='predicted_ndvi_pct')
    .opts(aspect='equal', xlim=(0, y_max), ylim=(0, y_max), width=600, height=600)
) * hv.Slope(slope=1, y_intercept=0).opts(color='black')
Out[73]:
InΒ [74]:
# Calculate the error between predicted and measured NDVI % per tract

oak_tract_final['predicted_ndvi_pct'] = np.exp(linear_reg.predict(X))
oak_tract_final['error_ndvi_pct'] = (
    oak_tract_final['predicted_ndvi_pct']
    - oak_tract_final['ndvi_pct']
)

# Plot the prediction error as a choropleth; the diverging RdBu colormap
# distinguishes over- from under-prediction.
results_plot = oak_tract_final.hvplot(
    c='error_ndvi_pct',
    cmap='RdBu',
    geo=True,
    tiles='CartoLight',
    width=600,
    height=600,
    # Title corrected: the original was copy-pasted from the
    # median-income map and mislabeled this error plot.
    title='Choropleth Plot - NDVI % Prediction Error')
results_plot
Out[74]:

Analysis Summary

We examined a possible relationship between median income and the percent of pixels with NDVI above 0.12 per tract, assuming that as median income increased, NDVI above the threshold would increase as well. However, we see a large discrepancy in the predicted relationship in the eastern areas of Oakland. This area, known as the Oakland Hills, is generally very affluent but also has a lot of vegetated cover. It appears that the range of income in the affluent sections may not be well represented by the census tract aggregation, hence the large error in the NDVI prediction. Areas in the central portion of Oakland show little error in the predicted versus measured NDVI. This suggests that our analysis does a decent job of predicting the relationship for the more urban parts of the city, but fails in the eastern Oakland Hills area.